In [1]:
import copy
import cPickle
import datetime as dt
import glob
import os
import re
import subprocess
import urllib2
import cdpybio as cpb
from ipyparallel import Client
from scipy.stats import fisher_exact
import matplotlib.gridspec as gridspec
from matplotlib.patches import Rectangle
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pyencodetools as pet
import pybedtools as pbt
import scipy
import scipy.stats as stats
import seaborn as sns
import socket
import statsmodels.stats.multitest as smm
import vcf as pyvcf
import cardipspy as cpy
import ciepy
%matplotlib inline
%load_ext rpy2.ipython
dy_name = 'rare_variant_analysis'
import socket
if socket.gethostname() == 'fl-hn1' or socket.gethostname() == 'fl-hn2':
dy = os.path.join(ciepy.root, 'sandbox', 'tmp', dy_name)
cpy.makedir(dy)
pbt.set_tempdir(dy)
outdir = os.path.join(ciepy.root, 'output', dy_name)
cpy.makedir(outdir)
private_outdir = os.path.join(ciepy.root, 'private_output', dy_name)
cpy.makedir(private_outdir)
In [2]:
sns.set_style('whitegrid')
In [3]:
tg = pd.read_table(cpy.gencode_transcript_gene, index_col=0,
header=None, squeeze=True)
gene_info = pd.read_table(cpy.gencode_gene_info, index_col=0)
genes = pbt.BedTool(cpy.gencode_gene_bed)
fn = os.path.join(ciepy.root, 'output', 'input_data', 'rsem_tpm.tsv')
tpm = pd.read_table(fn, index_col=0)
log_tpm = np.log10(tpm + 1)
cnvs = pd.read_table(os.path.join(ciepy.root, 'output', 'input_data',
'cnvs.tsv'), index_col=0)
fn = os.path.join(ciepy.root, 'output', 'input_data', 'wgs_metadata.tsv')
wgs_meta = pd.read_table(fn, index_col=0, squeeze=True)
fn = os.path.join(ciepy.root, 'output', 'input_data', 'rnaseq_metadata.tsv')
rna_meta = pd.read_table(fn, index_col=0)
rna_meta_eqtl = rna_meta[rna_meta.in_eqtl]
fn = os.path.join(ciepy.root, 'output', 'input_data', 'subject_metadata.tsv')
subject_meta = pd.read_table(fn, index_col=0)
rna_meta = rna_meta.merge(subject_meta, left_on='subject_id', right_index=True)
fn = os.path.join(os.path.split(cpy.roadmap_15_state_annotation)[0], 'EIDlegend.txt')
roadmap_ids = pd.read_table(fn, squeeze=True, index_col=0, header=None)
fn = os.path.join(ciepy.root, 'output', 'eqtl_input', 'unrelateds.tsv')
unr_rna_meta = pd.read_table(fn, index_col=0)
fn = os.path.join(ciepy.root, 'output', 'eqtl_input', 'peer_10_residuals.tsv')
residuals = pd.read_table(fn, index_col=0).T
In [4]:
residuals = residuals[gene_info.ix[residuals.index, 'chrom'].apply(lambda x: x not in ['chrX', 'chrY'])]
In [5]:
fn = os.path.join(ciepy.root, 'output', 'input_data',
'mbased_major_allele_freq.tsv')
maj_af = pd.read_table(fn, index_col=0)
fn = os.path.join(ciepy.root, 'output', 'input_data',
'mbased_p_val_ase.tsv')
ase_pval = pd.read_table(fn, index_col=0)
ase_pval = ase_pval[rna_meta[rna_meta.in_eqtl].index]
ase_pval.columns = rna_meta[rna_meta.in_eqtl].wgs_id
locus_p = pd.Panel({'major_allele_freq':maj_af, 'p_val_ase':ase_pval})
locus_p = locus_p.swapaxes(0, 2)
In [6]:
fn = os.path.join(ciepy.root, 'private_output',
'rare_variant_processing', 'promoter_dhs_variants.pickle')
variants = cPickle.load(open(fn))
fn = os.path.join(ciepy.root, 'private_output',
'rare_variant_processing', 'promoter_dhs_genotypes.tsv')
genotypes = pd.read_table(fn, index_col=0)
# fn = os.path.join(ciepy.root, 'private_output',
# 'rare_variant_processing', 'tf_disruption.tsv')
# tf_disruption = pd.read_table(fn, index_col=0)
In [7]:
unrelateds = unr_rna_meta
I'll remove variants whose minor allele wasn't in the unrelateds or that weren't called in at least 100 of the unrelateds.
In [8]:
variants_f = variants[variants.unr_mac > 0]
variants_f = variants_f[variants_f.unr_called >= 100]
variants_f.shape
Out[8]:
I'm trying to decide on the definition of rare. There are 284,025 promoter DHS variants. Here are the number of 1,000 Genomes populations below 0.5% for each variant:
In [9]:
(variants_f[[x for x in variants_f.columns if '_AF' in x]] < 0.005).sum(axis=1).value_counts()
Out[9]:
This is what it looks like if we remove variants that aren't in 1KGP.
In [10]:
tdf = variants_f[variants_f.AF != 0]
(tdf[[x for x in tdf.columns if '_AF' in x]] < 0.005).sum(axis=1).value_counts()
Out[10]:
Let's keep variants with MAF less than 0.5% in 1KGP.
In [11]:
variants_f = variants_f[variants_f.AF <= 0.005]
variants_f.shape
Out[11]:
This is the number of alleles observed in my unrelateds:
In [12]:
variants_f.unr_mac.value_counts().head(10)
Out[12]:
And this is the minor allele frequency:
In [13]:
variants_f['unr_maf'] = (variants_f.unr_mac / (variants_f.unr_called * 2.))
variants_f.unr_maf.hist(bins=np.arange(0, 0.51, 0.01))
plt.ylabel('Number of variants')
plt.xlabel('Minor allele frequency');
There are some variants that I see at a higher frequency but most are rare. I'm going to keep things whose minor allele count is one.
In [14]:
variants_f = variants_f[variants_f.unr_mac == 1]
variants_f.shape
Out[14]:
In [15]:
sum([len(x & set(residuals.index)) > 0 for x in variants_f.genes])
Out[15]:
In [16]:
variants_f.max_phyloP100way.hist(bins=np.arange(-20, 10.1, 0.1))
plt.xlabel('Max phyloP score')
plt.ylabel('Number of variants');
In [17]:
variants_f.cadd_phred.hist(bins=np.arange(0, 51))
plt.ylabel('Number of variants')
plt.xlabel('CADD PHRED score');
In [18]:
def stratify_on_rare(variants, genotypes, exp, ase_pvals):
has_rare = pd.DataFrame(False, index=gene_info.index, columns=unrelateds.wgs_id)
cadd = pd.DataFrame(np.nan, index=gene_info.index, columns=unrelateds.wgs_id)
phylop = pd.DataFrame(np.nan, index=gene_info.index, columns=unrelateds.wgs_id)
for i,ind in enumerate(variants.index):
se = genotypes.ix[ind, unrelateds.wgs_id]
mode = se.mode().values[0]
se = se.fillna(mode)
rare_samples = se[se != mode].index
not_rare_samples = se[se == mode].index
has_rare.ix[variants.ix[i, 'genes'], rare_samples] = True
for g in variants.ix[i, 'genes']:
cadd.ix[g, rare_samples] = \
pd.DataFrame([cadd.ix[g, rare_samples].values,
variants.ix[i, 'cadd_phred'] * np.ones(len(rare_samples))]).max().values
phylop.ix[g, rare_samples] = \
pd.DataFrame([phylop.ix[g, rare_samples].values,
variants.ix[i, 'max_phyloP100way'] * np.ones(len(rare_samples))]).max().values
has_rare_f = has_rare.ix[exp.index]
cadd_f = cadd.ix[has_rare_f.index]
phylop_f = phylop.ix[has_rare_f.index]
has_rare_f = has_rare_f.stack()
cadd_f = cadd_f.stack()
phylop_f = phylop_f.stack()
sz = exp.stack()
ases = ase_pvals.stack()
rare_info = pd.DataFrame({'exp':sz.ix[has_rare_f[has_rare_f].index],
'cadd':cadd_f.ix[has_rare_f[has_rare_f].index],
'phylop':phylop_f.ix[has_rare_f[has_rare_f].index]})
rare_info['ase_pval'] = ases.ix[rare_info.index]
not_rare_info = pd.DataFrame({'exp':sz.ix[has_rare_f[has_rare_f == False].index]})
not_rare_info['ase_pval'] = ases.ix[not_rare_info.index]
return rare_info, not_rare_info
def summarize_stratification(rare_info, not_rare_info):
r = sum(rare_info.ase_pval.dropna() < 0.005) / float(rare_info.ase_pval.dropna().shape[0])
nr = sum(not_rare_info.ase_pval.dropna() < 0.005) / float(not_rare_info.ase_pval.dropna().shape[0])
a = (rare_info.ase_pval.dropna() < 0.005).value_counts()
b = (not_rare_info.ase_pval.dropna() < 0.005).value_counts()
odds, p = stats.fisher_exact([[a[True], a[False]], [b[True], b[False]]])
print('Gene/samples with rare promoter DHS variant have ASE {:.2f}% of the '
'time while gene/samples without rare promoter DHS variant have ASE '
'{:.2f}% of the time (odds={:.2f}, p={:.2e}, Fisher exact).\n'.format(r * 100, nr * 100, odds, p))
s,p = stats.mannwhitneyu(rare_info.exp, not_rare_info.exp)
print('Mann Whitney U for distributions of expression values for genes with and without '
'rare promoter DHS variants: p={:.2e}.\n'.format(p))
s,p = stats.mannwhitneyu(rare_info.exp.abs(), not_rare_info.exp.abs())
print('Mann Whitney U for distributions of expression magnitudes for genes with and without '
'rare promoter DHS variants: p={:.2e}.\n'.format(p))
weights = np.ones_like(not_rare_info.exp) / float(not_rare_info.shape[0])
m = np.floor(min(not_rare_info.exp.min(), rare_info.exp.min()))
not_rare_info.exp.hist(bins=np.arange(m, abs(m) + 0.2, 0.2),
alpha=0.5, histtype='stepfilled',
label='not rare $n={}$'.format(not_rare_info.shape[0]),
weights=weights)
weights = np.ones_like(rare_info.exp) / float(rare_info.shape[0])
rare_info.exp.hist(bins=np.arange(m, abs(m) + 0.2, 0.2),
alpha=0.5, histtype='stepfilled',
label='rare $n={}$'.format(rare_info.shape[0]),
weights=weights)
plt.legend()
plt.xlabel('Expression')
plt.ylabel('Fraction of gene/sample pairs');
def make_pdfs_cdfs(rare_info, not_rare_info):
# Make PDFs
m = np.floor(rare_info.exp.min())
pdfs = pd.DataFrame(index=np.arange(m, abs(m) + 0.1, 0.1))
density = scipy.stats.gaussian_kde(rare_info.exp)
pdfs['rare'] = density(pdfs.index)
density = scipy.stats.gaussian_kde(not_rare_info.exp)
pdfs['not_rare'] = density(pdfs.index)
fig,axs = plt.subplots(1, 2, figsize=(10, 4))
ax = axs[0]
# Plot PDFs
pdfs.rare.plot(label='rare', ax=ax)
pdfs.not_rare.plot(label='not rare', ax=ax)
ax.legend()
ax.set_ylabel('Fraction of genes')
ax.set_xlabel('Expression')
ax = axs[1]
(pdfs.rare - pdfs.not_rare).plot(ax=ax)
ax.set_ylabel('$\Delta$(rare - not rare) fraction of genes')
ax.set_xlabel('Expression')
fig.tight_layout()
# Make CDFs
cdfs = pd.DataFrame(index=pdfs.index)
area = [0]
for i in range(1, cdfs.shape[0]):
area.append(0.5 * (pdfs['rare'].loc[pdfs.index[i - 1]] + pdfs['rare'][pdfs.index[i]]) * 0.1)
cdfs['rare'] = pd.Series(area, index=cdfs.index).cumsum()
area = [0]
for i in range(1, cdfs.shape[0]):
area.append(0.5 * (pdfs['not_rare'].loc[pdfs.index[i - 1]] + pdfs['not_rare'][pdfs.index[i]]) * 0.1)
cdfs['not_rare'] = pd.Series(area, index=cdfs.index).cumsum()
# Plot CDFs
cdfs.plot()
plt.xlabel('Expression')
plt.ylabel('Probability');
return pdfs, cdfs
In [19]:
residuals_c = (residuals.T - residuals.mean(axis=1)).T
residuals_z = (residuals_c.T / residuals_c.std(axis=1)).T
In [20]:
std = residuals.std(axis=1)
std.sort_values(inplace=True, ascending=False)
rc = residuals_z.ix[std.head(500).index, unrelateds.wgs_id].corr()
sns.heatmap(rc, xticklabels=[], yticklabels=[])
plt.title('Sample correlation');
The samples aren't really correlated in any meaningful way after PEER, so I think I can use samples from different ancestries.
In [21]:
a = os.path.join(private_outdir, 'all_rare_variants_info.tsv')
b = os.path.join(private_outdir, 'all_not_rare_variants_info.tsv.gz')
if not os.path.exists(a) and not os.path.exists(b):
all_rare_info, all_not_rare_info = stratify_on_rare(
variants_f,
genotypes[unrelateds.wgs_id],
residuals_z[unrelateds.wgs_id],
ase_pval[unrelateds.wgs_id])
all_rare_info.to_csv(a, sep='\t')
all_not_rare_info.to_csv(b, sep='\t', compression='gzip')
else:
all_rare_info = pd.read_table(a, index_col=0)
all_not_rare_info = pd.read_table(b, index_col=0)
In [22]:
(all_rare_info.shape[0] + all_not_rare_info.shape[0]) / 131.
Out[22]:
In [23]:
len(set(all_rare_info.index) | set(all_not_rare_info.index))
Out[23]:
In [24]:
robust_exp = pd.Series(False, index=gene_info.index)
robust_exp.ix[residuals_z.index] = True
tt = [sum(robust_exp[x]) > 0 for x in variants_f.genes]
print('Found {:,} distinct rpdSNVs.'.format(sum(tt)))
In [25]:
n = len(set([x[0] for x in all_rare_info.index]))
print('{:,} of {:,} robustly expressed genes have at least one rpdSNV.'.format(n, residuals_z.shape[0]))
Note that I use all gene expression values, even if a particular gene didn't have an rpdSNV in any sample.
In [58]:
n = all_rare_info.shape[0]
print('{:,} gene expression estimates from gene/samples with an rpdSNV.'.format(n))
In [60]:
n = all_not_rare_info.shape[0]
print('{:,} gene expression estimates from gene/samples without an rpdSNV.'.format(n))
In [28]:
all_rare_info.shape[0] + all_not_rare_info.shape[0]
Out[28]:
In [63]:
8.76e-04
Out[63]:
In [30]:
summarize_stratification(all_rare_info, all_not_rare_info)
In [31]:
all_pdfs,all_cdfs = make_pdfs_cdfs(all_rare_info, all_not_rare_info)
In [32]:
vc = pd.Series([x[1] for x in all_rare_info.index]).value_counts()
c = pd.Series(['red', 'blue'], index=['Blood', 'Fibroblast'])[wgs_meta.ix[vc.index, 'cell']]
vc.plot(kind='bar', color=c)
plt.xticks([]);
In [33]:
stats.spearmanr(all_rare_info.exp.abs(), all_rare_info.cadd)
Out[33]:
In [34]:
stats.spearmanr(all_rare_info.exp.abs(), all_rare_info.phylop)
Out[34]:
In [35]:
summarize_stratification(all_rare_info[all_rare_info.cadd >= 10], all_not_rare_info)
In [36]:
all_pdfs,all_cdfs = make_pdfs_cdfs(all_rare_info[all_rare_info.cadd >= 10], all_not_rare_info)
In [37]:
summarize_stratification(all_rare_info[all_rare_info.cadd >= 20], all_not_rare_info)
In [38]:
all_pdfs,all_cdfs = make_pdfs_cdfs(all_rare_info[all_rare_info.cadd >= 20], all_not_rare_info)
In [39]:
summarize_stratification(all_rare_info[all_rare_info.phylop >= 1], all_not_rare_info)
In [40]:
all_pdfs,all_cdfs = make_pdfs_cdfs(all_rare_info[all_rare_info.phylop >= 1], all_not_rare_info)
In [41]:
summarize_stratification(all_rare_info[all_rare_info.phylop >= 3], all_not_rare_info)
In [42]:
all_pdfs,all_cdfs = make_pdfs_cdfs(all_rare_info[all_rare_info.phylop >= 3], all_not_rare_info)
In [43]:
fn = os.path.join(ciepy.root, 'private_output', 'rare_variant_processing', 'rare_cnvs.pickle')
rare_cnv_info = cPickle.load(open(fn))
In [44]:
def stratify_on_rare_cnvs(variants, exp, ase_pvals):
has_rare = pd.DataFrame(False, index=gene_info.index, columns=unrelateds.wgs_id)
contains_gene = pd.DataFrame(False, index=gene_info.index, columns=unrelateds.wgs_id)
overlaps_exon = pd.DataFrame(False, index=gene_info.index, columns=unrelateds.wgs_id)
for i,ind in enumerate(variants.index):
has_rare.ix[variants.ix[i, 'overlaps_gene'], variants.ix[i, 'sample']] = True
if variants.ix[i, 'contains_gene'] > np.nan:
contains_gene.ix[variants.ix[i, 'contains_gene'], variants.ix[i, 'sample']] = True
if variants.ix[i, 'overlaps_gene_exon'] > np.nan:
overlaps_exon.ix[variants.ix[i, 'overlaps_gene_exon'], variants.ix[i, 'sample']] = True
has_rare_f = has_rare.ix[exp.index]
contains_gene_f = contains_gene.ix[has_rare_f.index]
overlaps_exon_f = overlaps_exon.ix[has_rare_f.index]
has_rare_f = has_rare_f.stack()
contains_gene_f = contains_gene_f.stack()
overlaps_exon_f = overlaps_exon_f.stack()
sz = exp.stack()
ases = ase_pvals.stack()
rare_info = pd.DataFrame({'exp':sz.ix[has_rare_f[has_rare_f].index],
'contains_gene':contains_gene_f.ix[has_rare_f[has_rare_f].index],
'overlaps_exon':overlaps_exon_f.ix[has_rare_f[has_rare_f].index]})
rare_info['ase_pval'] = ases.ix[rare_info.index]
not_rare_info = pd.DataFrame({'exp':sz.ix[has_rare_f[has_rare_f == False].index]})
not_rare_info['ase_pval'] = ases.ix[not_rare_info.index]
return rare_info, not_rare_info
In [45]:
rare_del, not_rare_del = stratify_on_rare_cnvs(
rare_cnv_info[rare_cnv_info.svtype == 'DEL'], residuals_z, ase_pval)
In [46]:
rare_dup, not_rare_dup = stratify_on_rare_cnvs(
rare_cnv_info[rare_cnv_info.svtype == 'DUP'], residuals_z, ase_pval)
In [66]:
a = rare_del.overlaps_exon.sum()
b = rare_del.shape[0]
print('{} of {} rare deletions overlap exons.'.format(a, b))
In [48]:
summarize_stratification(rare_del, not_rare_del)
In [49]:
all_pdfs,all_cdfs = make_pdfs_cdfs(rare_del, not_rare_del)
In [50]:
summarize_stratification(rare_del[rare_del.overlaps_exon], not_rare_del)
In [51]:
all_pdfs,all_cdfs = make_pdfs_cdfs(rare_del[rare_del.overlaps_exon], not_rare_del)
In [52]:
a = rare_dup.overlaps_exon.sum()
b = rare_dup.shape[0]
print('{} of {} rare duplications overlap exons.'.format(a, b))
In [53]:
summarize_stratification(rare_dup, not_rare_dup)
In [54]:
all_pdfs,all_cdfs = make_pdfs_cdfs(rare_dup, not_rare_dup)
In [55]:
summarize_stratification(rare_dup[rare_dup.overlaps_exon], not_rare_dup)
In [56]:
all_pdfs,all_cdfs = make_pdfs_cdfs(rare_dup[rare_dup.overlaps_exon], not_rare_dup)
In [57]:
rare_del.to_csv(os.path.join(private_outdir, 'rare_del.tsv'), sep='\t')
not_rare_del.to_csv(os.path.join(private_outdir, 'not_rare_del.tsv'), sep='\t')
rare_dup.to_csv(os.path.join(private_outdir, 'rare_dup.tsv'), sep='\t')
not_rare_dup.to_csv(os.path.join(private_outdir, 'not_rare_dup.tsv'), sep='\t')
In [81]:
s,p = stats.mannwhitneyu(rare_dup.ix[rare_dup.overlaps_exon, 'exp'],
rare_dup.ix[rare_dup.overlaps_exon == False, 'exp'])
print('The distribution of expression values between rare duplications that overlap '
'exons and rare dups that don\'t overlap exons is significantly different '
'(p={:.4f}, Mann Whitney U).'.format(p))
In [79]:
s,p = stats.mannwhitneyu(rare_del.ix[rare_del.overlaps_exon, 'exp'],
rare_del.ix[rare_del.overlaps_exon == False, 'exp'])
print('The distribution of expression values between rare deletions that overlap '
'exons and rare dels that don\'t overlap exons is significantly different '
'(p={:.3e}, Mann Whitney U).'.format(p))
In [82]:
rare_del.head()
Out[82]:
In [93]:
a = rare_del[rare_del.contains_gene & rare_del.overlaps_exon]
b = rare_del[(rare_del.contains_gene == False) & rare_del.overlaps_exon]
In [101]:
b.exp.hist()
a.exp.hist()
Out[101]:
In [92]:
pd.crosstab(rare_del.contains_gene, rare_del.overlaps_exon)
Out[92]:
In [91]:
rare_cnv_info.head()
Out[91]:
In [112]:
stats.norm.ppf(0.975)
Out[112]:
In [109]:
stats.norm.ppf?
In [107]:
stats.norm.cdf(1.67)
Out[107]:
In [ ]:
pvals = pd.DataFrame(stats.norm.cdf(tdf), index=tdf.index, columns=tdf.columns)